1.Medal Counts over Time
setwd("/Users/luqi/Desktop")
ath = read.csv("athletes_and_events.csv")
nocregions = read.csv("noc_regions.csv")
gdppop = read.csv("gdp_pop.csv")
#install.packages("tidyverse")
athsummer <- ath[which(ath$Season=='Summer'),]
library(tidyr)
tathsummer = drop_na(athsummer, Medal)
tathsummer= tathsummer[!duplicated(tathsummer), ]
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#group_by(athsummer,NOC, add = FALSE)
cleandata <- tathsummer %>%
group_by(NOC) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze"))) %>%
arrange(desc(.$medal_number))
#cleandata
tathsummerUSA <- ath[which(tathsummer$NOC=='USA'),]
cleandataUSA <- tathsummerUSA %>%
group_by(Year,Sex) %>%
summarize(gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze")))
cleandataUSA
## # A tibble: 65 x 5
## # Groups: Year [35]
## Year Sex gold_number silver_number bronze_number
## <int> <fct> <int> <int> <int>
## 1 1896 M 0 0 0
## 2 1900 M 4 8 2
## 3 1904 M 1 3 1
## 4 1906 M 1 1 3
## 5 1908 F 0 0 0
## 6 1908 M 5 3 4
## 7 1912 M 10 8 10
## 8 1920 F 0 0 1
## 9 1920 M 6 8 8
## 10 1924 F 0 0 0
## # … with 55 more rows
library(ggplot2)
# ggplot(data=cleandataUSA,
# aes(x=Year,
# y=gold_number))+
# geom_line(aes(group=Sex,
# color=Sex))
# look at USA's gold medal number by time trend, considering the gneder of the gold medal winners
p = ggplot(data=cleandataUSA,
aes(x=Year,
y=gold_number))
p + geom_line(aes(group=Sex,
color=Sex))+
labs(x="Year",
y="Gold Medal Number",
color="Gender") +
ggtitle("USA's Gold Medal Number by Sex")+ geom_point(size=2,aes(color=Sex))

cleandata_add_Medal <- tathsummer %>%
group_by(NOC,Medal) %>%
summarize(medal_number = length(Medal))
#cleandata_add_Medal
cleandata_add_Medal_top5 <- cleandata_add_Medal[which(cleandata_add_Medal$NOC == 'USA'|cleandata_add_Medal$NOC =='URS'|cleandata_add_Medal$NOC =='GBR'|cleandata_add_Medal$NOC =='GER'|cleandata_add_Medal$NOC =='FRA'),]
cleandata_add_Medal_top5
## # A tibble: 15 x 3
## # Groups: NOC [5]
## NOC Medal medal_number
## <fct> <fct> <int>
## 1 FRA Bronze 587
## 2 FRA Gold 463
## 3 FRA Silver 567
## 4 GBR Bronze 620
## 5 GBR Gold 635
## 6 GBR Silver 729
## 7 GER Bronze 649
## 8 GER Gold 592
## 9 GER Silver 538
## 10 URS Bronze 596
## 11 URS Gold 832
## 12 URS Silver 635
## 13 USA Bronze 1197
## 14 USA Gold 2472
## 15 USA Silver 1333
g = ggplot(data=cleandata_add_Medal_top5,aes(NOC,fill=Medal))
g + geom_bar(aes(weight=medal_number),position = "dodge")+scale_fill_manual(values=c("darkred", "gold","gray40"))+ggtitle("Top 5 Medal winning NOCs")+labs(x="NOC",
y="Medal Number")

# I would recommend the first visualization because we can see some interesting pattern in the graph, as time goes by, in USA, the gap of medal numbers won by male and female is narrowing.
2.Medal Counts adjusted by Population, GDP
tath = drop_na(ath, Medal)
tath = tath[!duplicated(tath), ]
gdppop=gdppop[!duplicated(gdppop), ]
addvalue<- tath %>%
group_by(NOC) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze")),medal_value = 3*as.numeric(gold_number) + 2*as.numeric(silver_number)+as.numeric(bronze_number)) %>%
arrange(desc(.$medal_number))
#addvalue
fulltable= merge(x = addvalue, y = gdppop, by.x = "NOC", by.y = "Code", all = TRUE)
#fulltable
fulltable$medal_value_byGDP = (as.numeric(fulltable$medal_value)/as.numeric(fulltable$GDP.per.Capita))
fulltable$medal_value_byPOP = (as.numeric(fulltable$medal_value)/as.numeric(fulltable$Population))*1000000
fulltable$highlight <- fulltable$NOC == 'CHN'
fulltable <- fulltable %>%
mutate(highlight=replace(highlight, highlight == "TRUE", "CHN"), highlight=replace(highlight, highlight == "FALSE", "OTHER"))
Ranking_plot1=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value)) +
geom_point(aes(colour = highlight)) +
scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+
labs(x="GDP.per.Capita",
y="Unadjusted Medal Value",
color="NOC") +
ggtitle("CHN's Ranking in Unadjusted Medal Value")
Ranking_plot1
## Warning: Removed 99 rows containing missing values (geom_point).

Ranking_plot2=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value_byGDP)) +
geom_point(aes(colour = highlight)) +
scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+
labs(x="GDP.per.Capita",
y="Medal Value Adjusted by GDP",
color="NOC") +
ggtitle("CHN's Ranking in Adjusted Medal Value(by GDP)")
Ranking_plot2
## Warning: Removed 99 rows containing missing values (geom_point).

Ranking_plot3=ggplot(fulltable, aes(x = GDP.per.Capita, y = medal_value_byPOP)) +
geom_point(aes(colour = highlight)) +
scale_colour_manual(values = c("OTHER" = "black", "CHN" = "red"))+
labs(x="GDP.per.Capita",
y="Medal Value Adjusted by Population",
color="NOC") +
ggtitle("CHN's Ranking in Adjusted Medal Value(by Population)")
Ranking_plot3
## Warning: Removed 99 rows containing missing values (geom_point).

library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(Ranking_plot1, Ranking_plot2, Ranking_plot3,nrow=3)
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).

## From the comparison of 3 plots we can see that CHN ranks top when medal_value is adjusted by GDP, and ranks very low when medal_value is adjusted by population, which shows that CHN has a really large population.
3.Host Country Advantage
library(rvest)
## Loading required package: xml2
library(stringr)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Summer_Olympic_Games")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[8]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ", ")[,2]
tathsummer_host <- tathsummer[which(tathsummer$NOC =='FRA'|tathsummer$NOC =='USA'|tathsummer$NOC == 'GBR'|tathsummer$NOC =='SWE'|tathsummer$NOC =='BEL'|tathsummer$NOC =='NED'|tathsummer$NOC =='GER'|tathsummer$NOC =='GBR'|tathsummer$NOC =='FIN'|tathsummer$NOC =='ANZ'|tathsummer$NOC =='ITA'|tathsummer$NOC =='JPN'|tathsummer$NOC =='MEX'|tathsummer$NOC =='CAN'|tathsummer$NOC =='URS'|tathsummer$NOC =='KOR'|tathsummer$NOC =='ESP'|tathsummer$NOC =='GRE'|tathsummer$NOC =='CHN'|tathsummer$NOC =='BRA'),]
#tathsummer_host
tathsummer_host_addyear<- tathsummer_host %>%
group_by(NOC,Year) %>%
summarize(medal_number = length(Medal))
#tathsummer_host_addyear
tathsummer_host_group<- tathsummer_host %>%
group_by(NOC) %>%
summarize(medal_number = length(Medal))
#tathsummer_host_group
NOC = c("ANZ","ANZ","BEL","BEL","BRA","BRA","CAN","CAN","CHN","CHN","ESP","ESP","FIN","FIN","FRA","FRA","GBR","BGR","GER","GER","GRE","GRE","ITA","ITA","JPN","JPN","KOR","KOR","MEX","MEX","NED","NED","SWE","SWE","URS","URS","USA","USA")
Host = c("true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false","true","false")
Yearly_Average_MedalNumber = c(0,1.1,188,9.9,50,15.7,23,26.6,184,26.9,69,15.5,40,16.1,172.5,49.3,247,57.3,224,91.4,39.5,5.6,88,50.3,62,29.2,77,17.6,9,3.7,57,31.9,190,34,442,60,298.5,158.7)
df = data.frame(NOC,Host,Yearly_Average_MedalNumber)
df
## NOC Host Yearly_Average_MedalNumber
## 1 ANZ true 0.0
## 2 ANZ false 1.1
## 3 BEL true 188.0
## 4 BEL false 9.9
## 5 BRA true 50.0
## 6 BRA false 15.7
## 7 CAN true 23.0
## 8 CAN false 26.6
## 9 CHN true 184.0
## 10 CHN false 26.9
## 11 ESP true 69.0
## 12 ESP false 15.5
## 13 FIN true 40.0
## 14 FIN false 16.1
## 15 FRA true 172.5
## 16 FRA false 49.3
## 17 GBR true 247.0
## 18 BGR false 57.3
## 19 GER true 224.0
## 20 GER false 91.4
## 21 GRE true 39.5
## 22 GRE false 5.6
## 23 ITA true 88.0
## 24 ITA false 50.3
## 25 JPN true 62.0
## 26 JPN false 29.2
## 27 KOR true 77.0
## 28 KOR false 17.6
## 29 MEX true 9.0
## 30 MEX false 3.7
## 31 NED true 57.0
## 32 NED false 31.9
## 33 SWE true 190.0
## 34 SWE false 34.0
## 35 URS true 442.0
## 36 URS false 60.0
## 37 USA true 298.5
## 38 USA false 158.7
Adv_plot=ggplot(df, aes(x = NOC, y = Yearly_Average_MedalNumber )) +
geom_point(aes(colour = Host)) +
ggtitle("Host Country Advantage")
Adv_plot

4. Most successful athletes
namedata <- tath %>%
group_by(Name,Medal,Sex) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold")),silver_number = length(which(Medal=="Silver")),bronze_number = length(which(Medal=="Bronze"))) %>%
arrange(desc(.$medal_number))
#namedata
## I define "most successful athletes"as athletes who won most gold medals, let's look at top 10 gold medal winners and see the gender distribution
namedata_top10 = namedata[1:10,]
namedata_top10
## # A tibble: 10 x 7
## # Groups: Name, Medal [10]
## Name Medal Sex medal_number gold_number silver_number bronze_number
## <fct> <fct> <fct> <int> <int> <int> <int>
## 1 Michae… Gold M 23 23 0 0
## 2 "Raymo… Gold M 10 10 0 0
## 3 "Frede… Gold M 9 9 0 0
## 4 Larysa… Gold F 9 9 0 0
## 5 Mark A… Gold M 9 9 0 0
## 6 Paavo … Gold M 9 9 0 0
## 7 Birgit… Gold F 8 8 0 0
## 8 "Jenni… Gold F 8 8 0 0
## 9 "Matth… Gold M 8 8 0 0
## 10 Ole Ei… Gold M 8 8 0 0
g2 = ggplot(data=namedata_top10,aes(Name,fill=Sex))
g2 + geom_bar(aes(weight=gold_number),position = position_stack(reverse = TRUE))+coord_flip()+ggtitle("Top 10 Gold-Medal winning Atheletes")+labs(x="Name",
y="Gold Medal Number") +theme(legend.position = "top")

namedata_addsport_year <- tath %>%
group_by(Name,Sport,Year) %>%
summarize(medal_number = length(Medal),gold_number = length(which(Medal=="Gold"))) %>%
arrange(desc(.$gold_number))
#namedata_addsport_year
namedata_addsport_year_top10 <- namedata_addsport_year[which(namedata_addsport_year$Name == 'Michael Fred Phelps, II'|namedata_addsport_year$Name == 'Raymond Clarence "Ray" Ewry'|namedata_addsport_year$Name == 'Frederick Carlton "Carl" Lewis'|namedata_addsport_year$Name == 'Larysa Semenivna Latynina (Diriy-)'|namedata_addsport_year$Name == 'Mark Andrew Spitz'|namedata_addsport_year$Name == 'Paavo Johannes Nurmi'|namedata_addsport_year$Name == 'Birgit Fischer-Schmidt'|namedata_addsport_year$Name == 'Jennifer Elisabeth "Jenny" Thompson (-Cumpelik)'|namedata_addsport_year$Name == 'Matthew Nicholas "Matt" Biondi'|namedata_addsport_year$Name == 'Ole Einar Bjrndalen'),]
p2 <- ggplot(data=namedata_addsport_year_top10,
aes(x=Year,
y=gold_number))
p2 + geom_line(aes(group=Name,
color=Sport)) +
labs(x="Year",
y="Gold Medal Number",
color="Sport") + geom_point(size=4,aes(color=Sport)) + ggtitle("Top 10 Gold-Medal winning Atheletes' Sports")

## For this graph, every tiny line is an athlete, my interesting finding is that before 1925, top athletes are crazy about winning medals in Atheletics, and later on, the popular sport became swimming
5.Make two plots interactive
library(devtools)
devtools::install_github("ropensci/plotly",force=TRUE)
## Downloading GitHub repo ropensci/plotly@master
## from URL https://api.github.com/repos/ropensci/plotly/zipball/master
## Installing plotly
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file \
## --no-environ --no-save --no-restore --quiet CMD INSTALL \
## '/private/var/folders/rx/w35t6lg902l19fdtv5wqjfgr0000gn/T/Rtmpqu1upQ/devtools7c0319d4b1f/ropensci-plotly-c05f001' \
## --library='/Library/Frameworks/R.framework/Versions/3.5/Resources/library' \
## --install-tests
##
R.home(component = "home")
## [1] "/Library/Frameworks/R.framework/Resources"
#install.packages("usethis")
library(usethis)
##
## Attaching package: 'usethis'
## The following objects are masked from 'package:devtools':
##
## use_appveyor, use_build_ignore, use_code_of_conduct,
## use_coverage, use_cran_badge, use_cran_comments, use_data,
## use_data_raw, use_dev_version, use_git, use_git_hook,
## use_github, use_github_links, use_gpl3_license,
## use_mit_license, use_news_md, use_package, use_package_doc,
## use_rcpp, use_readme_md, use_readme_rmd, use_revdep,
## use_rstudio, use_test, use_testthat, use_travis, use_vignette
usethis::edit_r_environ()
## ● Edit /Users/luqi/.Renviron
## ● Restart R for changes to take effect
Sys.setenv("plotly_username"="luqi.chen")
Sys.setenv("plotly_api_key"="R9LBpABPVy7aUJ3Sx7jf")
interation_1=p + geom_line(aes(group=Sex,
color=Sex))+
labs(x="Year",
y="Gold Medal Number",
color="Gender") +
ggtitle("USA's Gold Medal Number by Sex")+ geom_point(size=2,aes(color=Sex))
#install.packages('plotly')
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
ggplotly(interation_1)
## In this interation graph, readers can easily get the statistic of gold number and year by pointing to each spot.
interation_2 = g + geom_bar(aes(weight=medal_number),position = "dodge")+scale_fill_manual(values=c("darkred", "gold","gray40"))+ggtitle("Top 5 Medal winning NOCs")+labs(x="NOC",
y="Medal Number")
ggplotly(interation_2)
# In this interation graph, readers can easily read the specific number of medals by pointing to each bar.
6.Data Table
#install.packages('DT')
library(DT)
datatable(cleandata_add_Medal_top5)
library(stringr)
pretty_headers <-
gsub("[.]", " ", colnames(cleandata_add_Medal_top5)) %>%
str_to_title()
cleandata_add_Medal_top5 %>%
datatable(
rownames = FALSE,
colnames = pretty_headers,
filter = list(position = "top"),
options = list(language = list(sSearch = "Filter:"))
)
## In this datatable, I can provide the medal information for a particular NOC(by using the column filter of Noc), I can also provide how gold medals are distributed in the top 5 Medal Winning NOCs(by using the column filter of Medal)